library(readr)
library(dplyr)
library(tidyr)
library(tcltk)
characters = read_csv("got/data/characters.csv")
episodes = read_csv("got/data/episodes.csv")
scenes = read_csv("got/data/scenes.csv")
appearances = read_csv("got/data/appearances.csv")summary(characters) name sex house killedBy
Length:587 Length:587 Length:587 Length:587
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character
image
Length:587
Class :character
Mode :character
str(characters)tibble [587 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ name : chr [1:587] "Addam Marbrand" "Adrack Humble" "Aeron Greyjoy" "Aerys Targaryen" ...
$ sex : chr [1:587] "male" "male" "male" "male" ...
$ house : chr [1:587] NA NA "Greyjoy" NA ...
$ killedBy: chr [1:587] NA NA NA NA ...
$ image : chr [1:587] NA NA "https://images-na.ssl-images-amazon.com/images/M/MV5BNzI5MDg0ZDAtN2Y2ZC00MzU1LTgyYjQtNTBjYjEzODczZDVhXkEyXkFqcG"| __truncated__ NA ...
- attr(*, "spec")=
.. cols(
.. name = col_character(),
.. sex = col_character(),
.. house = col_character(),
.. killedBy = col_character(),
.. image = col_character()
.. )
summary(episodes) episodeTitle episodeNum seasonNum episodeId total_duration
Length:73 Min. : 1.000 Min. :1.000 Min. : 1 Min. :2696
Class :character 1st Qu.: 3.000 1st Qu.:2.000 1st Qu.:19 1st Qu.:2969
Mode :character Median : 5.000 Median :4.000 Median :37 Median :3142
Mean : 5.192 Mean :4.205 Mean :37 Mean :3204
3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:55 3rd Qu.:3279
Max. :10.000 Max. :8.000 Max. :73 Max. :4510
str(episodes)tibble [73 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ episodeTitle : chr [1:73] "Winter Is Coming" "The Kingsroad" "Lord Snow" "Cripples, Bastards, and Broken Things" ...
$ episodeNum : num [1:73] 1 2 3 4 5 6 7 8 9 10 ...
$ seasonNum : num [1:73] 1 1 1 1 1 1 1 1 1 1 ...
$ episodeId : num [1:73] 1 2 3 4 5 6 7 8 9 10 ...
$ total_duration: num [1:73] 3509 3142 3257 3164 3073 ...
- attr(*, "spec")=
.. cols(
.. episodeTitle = col_character(),
.. episodeNum = col_double(),
.. seasonNum = col_double(),
.. episodeId = col_double(),
.. total_duration = col_double()
.. )
summary(scenes) sceneStart sceneEnd location subLocation
Length:3840 Length:3840 Length:3840 Length:3840
Class1:hms Class1:hms Class :character Class :character
Class2:difftime Class2:difftime Mode :character Mode :character
Mode :numeric Mode :numeric
episodeId duration nbc sceneId nbdeath
Min. : 1.00 Min. : 1.00 Min. : 1.000 Min. : 1.0 Min. :0.00000
1st Qu.:27.00 1st Qu.: 14.00 1st Qu.: 2.000 1st Qu.: 960.8 1st Qu.:0.00000
Median :46.00 Median : 36.00 Median : 2.000 Median :1920.5 Median :0.00000
Mean :44.18 Mean : 60.91 Mean : 3.155 Mean :1920.5 Mean :0.09792
3rd Qu.:65.00 3rd Qu.: 86.00 3rd Qu.: 4.000 3rd Qu.:2880.2 3rd Qu.:0.00000
Max. :73.00 Max. :661.00 Max. :24.000 Max. :3840.0 Max. :7.00000
str(scenes)tibble [3,840 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ sceneStart : 'hms' num [1:3840] 00:00:40 00:01:45 00:03:24 00:03:31 ...
..- attr(*, "units")= chr "secs"
$ sceneEnd : 'hms' num [1:3840] 00:01:45 00:03:24 00:03:31 00:03:38 ...
..- attr(*, "units")= chr "secs"
$ location : chr [1:3840] "The Wall" "North of the Wall" "North of the Wall" "North of the Wall" ...
$ subLocation: chr [1:3840] "Castle Black" "The Haunted Forest" "The Haunted Forest" "The Haunted Forest" ...
$ episodeId : num [1:3840] 1 1 1 1 1 1 1 1 1 1 ...
$ duration : num [1:3840] 65 99 7 7 112 5 7 10 23 18 ...
$ nbc : num [1:3840] 3 3 2 1 3 1 2 1 3 1 ...
$ sceneId : num [1:3840] 1 2 3 4 5 6 7 8 9 10 ...
$ nbdeath : num [1:3840] 0 0 1 0 0 0 0 0 1 0 ...
- attr(*, "spec")=
.. cols(
.. sceneStart = col_time(format = ""),
.. sceneEnd = col_time(format = ""),
.. location = col_character(),
.. subLocation = col_character(),
.. episodeId = col_double(),
.. duration = col_double(),
.. nbc = col_double(),
.. sceneId = col_double(),
.. nbdeath = col_double()
.. )
summary(appearances) name sceneId
Length:12114 Min. : 1
Class :character 1st Qu.: 937
Mode :character Median :1842
Mean :1859
3rd Qu.:2820
Max. :3840
str(appearances)tibble [12,114 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ name : chr [1:12114] "Gared" "Waymar Royce" "Will" "Gared" ...
$ sceneId: num [1:12114] 1 1 1 2 2 2 3 3 4 5 ...
- attr(*, "spec")=
.. cols(
.. name = col_character(),
.. sceneId = col_double()
.. )
sum(scenes$nbdeath)[1] 376
sum(scenes$nbdeath[(scenes$episodeId <= sum(episodes$seasonNum == 1))])[1] 32
sum(scenes$nbdeath[scenes$episodeId %in% episodes$episodeId[which(episodes$seasonNum == 1)]])[1] 32
sort(table(characters$killedBy),TRUE)[1:5]
Jon Snow Daenerys Targaryen Arya Stark Sandor Clegane
12 11 10 9
Cersei Lannister
7
scenes[which.max(scenes$duration),]# A tibble: 1 x 9
sceneStart sceneEnd location subLocation episodeId duration nbc sceneId nbdeath
<time> <time> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 45'58" 56'59" The Crownl… Outside King's… 73 661 18 3795 0
scenes %>%
arrange(desc(duration)) %>%
head(1) %>%
select(duration, episodeId)# A tibble: 1 x 2
duration episodeId
<dbl> <dbl>
1 661 73
scenes %>%
arrange(desc(duration)) %>%
head(1) %>%
left_join(appearances, by=c("sceneId")) %>%
select(name)# A tibble: 18 x 1
name
<chr>
1 Tyrion Lannister
2 Grey Worm
3 Samwell Tarly
4 Edmure Tully
5 Arya Stark
6 Bran Stark
7 Sansa Stark
8 Brienne of Tarth
9 Davos Seaworth
10 Gendry
11 Yara Greyjoy
12 Robin Arryn
13 Yohn Royce
14 Dornish Lord
15 Lord #1
16 Lord #2
17 Lord #3
18 Lord #4
scenes %>%
group_by(location) %>%
summarise(nbscenes = n()) %>%
arrange(desc(nbscenes)) %>%
head(1) %>%
select(location)# A tibble: 1 x 1
location
<chr>
1 The Crownlands
scenes %>%
filter(location == "Port Real") %>%
group_by(location) %>%
summarise(nbscenes = n())# A tibble: 0 x 2
# … with 2 variables: location <chr>, nbscenes <int>
scenes %>%
group_by(subLocation) %>%
summarise(nd = sum(nbdeath)) %>%
arrange(desc(nd))# A tibble: 97 x 2
subLocation nd
<chr> <dbl>
1 King's Landing 74
2 Winterfell 60
3 <NA> 51
4 Castle Black 41
5 The Twins 12
6 The Haunted Forest 9
7 The Wall 9
8 Craster's Keep 8
9 The Wolfswood 8
10 Dragonstone 6
# … with 87 more rows
appearances %>%
filter(name == "Jon Snow") %>%
left_join(scenes) %>%
left_join(episodes) %>%
group_by(name, episodeId, episodeTitle) %>%
summarise(screenTime = sum(duration)) %>%
arrange(desc(screenTime)) %>%
head(1)# A tibble: 1 x 4
# Groups: name, episodeId [1]
name episodeId episodeTitle screenTime
<chr> <dbl> <chr> <dbl>
1 Jon Snow 73 The Iron Throne 2526
appearances %>%
left_join(scenes) %>%
group_by(name) %>%
summarise(screenTime = sum(duration)) %>%
filter(screenTime>30*60) %>%
nrow()[1] 98
appearances %>%
left_join(appearances, by=c("sceneId"="sceneId")) %>%
filter(name.x != name.y) %>%
group_by(name.x,name.y) %>%
summarise(nbscenes = n()) %>%
arrange(desc(nbscenes))# A tibble: 8,472 x 3
# Groups: name.x [576]
name.x name.y nbscenes
<chr> <chr> <int>
1 Daenerys Targaryen Drogon 172
2 Drogon Daenerys Targaryen 172
3 Daenerys Targaryen Jorah Mormont 148
4 Jorah Mormont Daenerys Targaryen 148
5 Jon Snow Tormund Giantsbane 145
6 Tormund Giantsbane Jon Snow 145
7 Lord Varys Tyrion Lannister 141
8 Tyrion Lannister Lord Varys 141
9 Davos Seaworth Jon Snow 136
10 Jon Snow Davos Seaworth 136
# … with 8,462 more rows
appearances %>%
left_join(appearances, by=c("sceneId"="sceneId")) %>%
filter(name.x != name.y) %>%
left_join(scenes %>% select(sceneId,duration)) %>%
group_by(name.x,name.y) %>%
summarise(screenTime = sum(duration)) %>%
arrange(desc(screenTime))# A tibble: 8,472 x 3
# Groups: name.x [576]
name.x name.y screenTime
<chr> <chr> <dbl>
1 Daenerys Targaryen Jorah Mormont 12923
2 Jorah Mormont Daenerys Targaryen 12923
3 Lord Varys Tyrion Lannister 10764
4 Tyrion Lannister Lord Varys 10764
5 Davos Seaworth Jon Snow 10380
6 Jon Snow Davos Seaworth 10380
7 Daenerys Targaryen Missandei 9924
8 Missandei Daenerys Targaryen 9924
9 Jon Snow Tormund Giantsbane 9352
10 Tormund Giantsbane Jon Snow 9352
# … with 8,462 more rows
df <- scenes %>%
left_join(appearances) %>%
group_by(name,location) %>%
summarise(screenTime = sum(duration)) %>%
pivot_wider(values_from=screenTime,names_from=location,values_fill=c("duration"=0))
df# A tibble: 577 x 27
# Groups: name [577]
name `The Riverlands` `The North` `The Iron Islan… `North of the W… `The Red Waste`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Adda… 76 0 0 0 0
2 Adel… 0 36 0 0 0
3 Adra… 0 190 0 0 0
4 Aero… 0 0 601 0 0
5 Aery… 0 0 0 23 0
6 Aggo 0 0 0 0 185
7 Akho 0 0 0 0 0
8 Alli… 0 0 0 0 0
9 Alto… 92 0 0 0 0
10 Alys… 0 1050 0 0 0
# … with 567 more rows, and 21 more variables: `The Dothraki Sea` <dbl>, `Vaes
# Dothrak` <dbl>, `The Wall` <dbl>, `The Crownlands` <dbl>, `The Westerlands` <dbl>,
# Braavos <dbl>, `The Vale` <dbl>, `The Reach` <dbl>, Dorne <dbl>, `The Sunset
# Sea` <dbl>, Meereen <dbl>, Astapor <dbl>, Yunkai <dbl>, `The Stormlands` <dbl>,
# Volantis <dbl>, Pentos <dbl>, Qarth <dbl>, `The Narrow Sea` <dbl>, `The Shivering
# Sea` <dbl>, `The Summer Sea` <dbl>, Valyria <dbl>
X <- as.matrix(df[,-1])
Xs <- X[rowSums(X)>3600,]
Xns <- Xs/rowSums(Xs)
rownames(Xns) <- df$name[rowSums(X)>3600]
Xns The Riverlands The North The Iron Islands North of the Wall
Alliser Thorne 0.000000000 0.00000000 0.0000000 0.0000000000
Arya Stark 0.365864693 0.22249640 0.0000000 0.0000000000
Barristan Selmy 0.000000000 0.00000000 0.0000000 0.0000000000
The Red Waste The Dothraki Sea Vaes Dothrak The Wall
Alliser Thorne 0.00000000 0.00000000 0.00000000 1.000000000
Arya Stark 0.00000000 0.00000000 0.00000000 0.000000000
Barristan Selmy 0.00000000 0.00000000 0.00000000 0.000000000
The Crownlands The Westerlands Braavos The Vale The Reach
Alliser Thorne 0.00000000 0.000000000 0.00000000 0.00000000 0.00000000
Arya Stark 0.16981287 0.000000000 0.20608678 0.03240798 0.00000000
Barristan Selmy 0.29420709 0.000000000 0.00000000 0.00000000 0.00000000
Dorne The Sunset Sea Meereen Astapor Yunkai
Alliser Thorne 0.0000000000 0.000000000 0.00000000 0.00000000 0.00000000
Arya Stark 0.0000000000 0.003331277 0.00000000 0.00000000 0.00000000
Barristan Selmy 0.0000000000 0.000000000 0.38680249 0.13744868 0.18154174
The Stormlands Volantis Pentos Qarth The Narrow Sea
Alliser Thorne 0.000000000 0.00000000 0.00000000 0.00000000 0.0000000000
Arya Stark 0.000000000 0.00000000 0.00000000 0.00000000 0.0000000000
Barristan Selmy 0.000000000 0.00000000 0.00000000 0.00000000 0.0000000000
The Shivering Sea The Summer Sea Valyria
Alliser Thorne 0.000000000 0.0000000000 0.000000000
Arya Stark 0.000000000 0.0000000000 0.000000000
Barristan Selmy 0.000000000 0.0000000000 0.000000000
[ reached getOption("max.print") -- omitted 52 rows ]
hc <- hclust(dist(Xns,method="manhattan"))
X11()
plot(hc, main = "Clustering of the main characters (geographical profiles)", sub = "@roortheroor, 2020", xlab = "")capture <- tk_messageBox(message = "close plot", detail = "")